Title level 1

Title level 2

bold

italics

Load packages

library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag():    dplyr, stats
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
source("functions.R")

I downloaded the file and loaded it into R

# Fetch the gapminder five-year data (once) and read it into a data frame.
gapminder_url <- "https://raw.githubusercontent.com/swcarpentry/r-novice-gapminder/gh-pages/_episodes_rmd/data/gapminder-FiveYearData.csv"
gapminder_path <- "data/gapminder-FiveYearData.csv"

# download.file() errors when the destination directory is missing, so create
# it first. Skip the download when a local copy already exists so the document
# can be re-run without network access.
dir.create(dirname(gapminder_path), showWarnings = FALSE, recursive = TRUE)
if (!file.exists(gapminder_path)) {
  download.file(gapminder_url, destfile = gapminder_path)
}

gapminder <- read.csv(gapminder_path)

# Quick look at the first rows to confirm the file was parsed as expected.
head(gapminder)
##       country year      pop continent lifeExp gdpPercap
## 1 Afghanistan 1952  8425333      Asia  28.801  779.4453
## 2 Afghanistan 1957  9240934      Asia  30.332  820.8530
## 3 Afghanistan 1962 10267083      Asia  31.997  853.1007
## 4 Afghanistan 1967 11537966      Asia  34.020  836.1971
## 5 Afghanistan 1972 13079460      Asia  36.088  739.9811
## 6 Afghanistan 1977 14880372      Asia  38.438  786.1134

I wonder if rstats increases life expectancy over the years

# Static scatter plot: one point per row of gapminder, life expectancy
# against year.
p <- ggplot(gapminder, aes(x = year, y = lifeExp)) +
  geom_point()

# Print the plot object so it is rendered.
p

Let’s see the interactive version

# Render the ggplot object p as an interactive plotly widget.
ggplotly(p)

Making your own functions

If you are repeating yourself in your code, you may be able to solve that problem by making your own function!

# Small example vector (3, 4, 5, 6, 7, 10) to try the helper on.
cars <- c(3:7, 10)

# se() is not defined in this document; presumably it comes from the
# functions.R file sourced earlier -- TODO confirm.
se(cars)
## [1] 1.013794

Data manipulation with dplyr

You will likely want to get subsections of your dataframe and/or calculate means of a variable for a certain subsection; dplyr is your friend!

Explore select

# Re-read the long-format data from disk.
gapminder <- read.csv("data/gapminder-FiveYearData.csv")

# Two ways to end up with the same three columns: name the ones to keep ...
year_country_gdp <- select(gapminder, year, country, gdpPercap)
# ... or drop the others. This second assignment overwrites the first, and the
# remaining columns keep the order they have in gapminder.
year_country_gdp <- select(gapminder, -pop, -continent, -lifeExp)
names(year_country_gdp)
## [1] "country"   "year"      "gdpPercap"

Explore filter

# Step by step: keep the European rows, then pick three columns.
euro <- filter(gapminder, continent == "Europe")
year_country_gdp_euro <- select(euro, year, country, gdpPercap)

# The same two steps as a single pipe chain, without the intermediate object.
year_country_gdp_euro <- gapminder %>%
  filter(continent == "Europe") %>%
  select(year, country, gdpPercap)

exploring the amazing group_by and summarize functions

# One row per country: the mean of gdpPercap and the value se() returns for
# that country's gdpPercap values.
mean_gdp_percountry <- gapminder %>%
  group_by(country) %>%
  summarise(
    mean_gdp = mean(gdpPercap),
    se_gdp = se(gdpPercap)
  )

mean_gdp_percountry
## # A tibble: 142 x 3
##        country   mean_gdp     se_gdp
##         <fctr>      <dbl>      <dbl>
##  1 Afghanistan   802.6746   31.23550
##  2     Albania  3255.3666  344.20223
##  3     Algeria  4426.0260  378.26190
##  4      Angola  3607.1005  336.56641
##  5   Argentina  8955.5538  537.68144
##  6   Australia 19980.5956 2256.11315
##  7     Austria 20411.9163 2787.23968
##  8     Bahrain 18077.6639 1563.29518
##  9  Bangladesh   817.5588   67.86165
## 10     Belgium 19900.7581 2422.32683
## # ... with 132 more rows

Challenge: I want the mean, se, and sample size of life expectancy by continent

# NOTE(review): the grouping below is by continent AND country, so the result
# has one row per country (142 rows in the printed output), not one row per
# continent. Drop `country` from group_by() if per-continent statistics are
# what is wanted.
mean_se_life_percontinent <- gapminder %>%
  group_by(continent, country) %>%
  summarise(
    mean_life = mean(lifeExp),
    se_life = se(lifeExp),
    samsize_life = n() # number of rows in each group
  )

mean_se_life_percontinent
## # A tibble: 142 x 5
## # Groups:   continent [?]
##    continent                  country mean_life   se_life samsize_life
##       <fctr>                   <fctr>     <dbl>     <dbl>        <int>
##  1    Africa                  Algeria  59.03017 2.9849208           12
##  2    Africa                   Angola  37.88350 1.1562236           12
##  3    Africa                    Benin  48.77992 1.7691977           12
##  4    Africa                 Botswana  54.59750 1.7116922           12
##  5    Africa             Burkina Faso  44.69400 1.9762099           12
##  6    Africa                  Burundi  44.81733 0.9165096           12
##  7    Africa                 Cameroon  48.12850 1.5784640           12
##  8    Africa Central African Republic  43.86692 1.3627459           12
##  9    Africa                     Chad  46.77358 1.4110376           12
## 10    Africa                  Comoros  52.38175 2.3476081           12
## # ... with 132 more rows

combining ggplot and dplyr

# Life expectancy over time for the European rows only: one line colour and
# one facet panel per country.
euro_countries <- gapminder %>%
  filter(continent == "Europe") %>%
  ggplot(aes(x = year, y = lifeExp, color = country)) +
  geom_line() +
  facet_wrap(~country)

# Print the plot object so it is rendered.
euro_countries

# Pass the plot explicitly instead of relying on ggsave()'s default of
# last_plot(), so the saved figure does not depend on global plotting state.
ggsave("euro.png", plot = euro_countries)
## Saving 7 x 5 in image
# write.csv() cannot create a missing directory, so make sure it exists first.
dir.create("processed", showWarnings = FALSE)
write.csv(mean_gdp_percountry, "processed/mean_gdp_percountry.csv")

Data manipulation with tidyr

# Download the 'wide' data (once) and read it into a data frame.
gapminder_wide_url <- "https://raw.githubusercontent.com/swcarpentry/r-novice-gapminder/gh-pages/data/gapminder_wide.csv"
gapminder_wide_path <- "data/gapminder_wide.csv"

# download.file() errors when the destination directory is missing, so create
# it first. Skip the download when a local copy already exists so the document
# can be re-run without network access.
dir.create(dirname(gapminder_wide_path), showWarnings = FALSE, recursive = TRUE)
if (!file.exists(gapminder_wide_path)) {
  download.file(gapminder_wide_url, destfile = gapminder_wide_path)
}

gapminder_wide <- read.csv(gapminder_wide_path)

# Stack every measurement column into key/value pairs: the old column name
# goes into obstype_year and the cell value into obs_values.
# The identifier columns are excluded by name rather than selecting positions
# 3:38, so the reshape keeps working if columns are added or reordered.
gap_long <- gapminder_wide %>%
  gather(obstype_year, obs_values, -continent, -country)

head(gap_long)
##   continent      country   obstype_year obs_values
## 1    Africa      Algeria gdpPercap_1952  2449.0082
## 2    Africa       Angola gdpPercap_1952  3520.6103
## 3    Africa        Benin gdpPercap_1952  1062.7522
## 4    Africa     Botswana gdpPercap_1952   851.2411
## 5    Africa Burkina Faso gdpPercap_1952   543.2552
## 6    Africa      Burundi gdpPercap_1952   339.2965

Separate the obstype_year column into obs_type and year

# Split keys such as "gdpPercap_1952" at the underscore into an observation
# type and a year, then give each observation type its own column again.
# separate() leaves year as a character column here.
gap_normal <- gap_long %>%
  separate(obstype_year, into = c("obs_type", "year"), sep = "_") %>%
  spread(obs_type, obs_values)

head(gap_normal)
##   continent country year gdpPercap lifeExp      pop
## 1    Africa Algeria 1952  2449.008  43.077  9279525
## 2    Africa Algeria 1957  3013.976  45.685 10270856
## 3    Africa Algeria 1962  2550.817  48.303 11000948
## 4    Africa Algeria 1967  3246.992  51.407 12760499
## 5    Africa Algeria 1972  4182.664  54.518 14760787
## 6    Africa Algeria 1977  4910.417  58.014 17152804
# List the differences between the original data frame and the reshaped one.
# The column order differs between the two (compare their head() printouts),
# so a report of mismatches is expected here.
all.equal(gapminder,gap_normal)
##  [1] "Names: 5 string mismatches"                                                                            
##  [2] "Component 1: Attributes: < Component \"levels\": Lengths (142, 5) differ (string compare on first 5) >"
##  [3] "Component 1: Attributes: < Component \"levels\": 5 string mismatches >"                                
##  [4] "Component 1: 1704 string mismatches"                                                                   
##  [5] "Component 2: Attributes: < target is NULL, current is list >"                                          
##  [6] "Component 2: target is numeric, current is factor"                                                     
##  [7] "Component 3: Modes: numeric, character"                                                                
##  [8] "Component 3: target is numeric, current is character"                                                  
##  [9] "Component 4: 'current' is not a factor"                                                                
## [10] "Component \"lifeExp\": Mean relative difference: 0.203822"                                             
## [11] "Component 6: Mean relative difference: 4101.546"
# Sort the reshaped rows by country, then continent, then year, and compare
# against the original data frame again.
gap_normal <- arrange(gap_normal, country, continent, year)
all.equal(gapminder, gap_normal)
##  [1] "Names: 5 string mismatches"                                                                            
##  [2] "Component 1: Attributes: < Component \"levels\": Lengths (142, 5) differ (string compare on first 5) >"
##  [3] "Component 1: Attributes: < Component \"levels\": 5 string mismatches >"                                
##  [4] "Component 1: 1704 string mismatches"                                                                   
##  [5] "Component 2: Attributes: < target is NULL, current is list >"                                          
##  [6] "Component 2: target is numeric, current is factor"                                                     
##  [7] "Component 3: Modes: numeric, character"                                                                
##  [8] "Component 3: target is numeric, current is character"                                                  
##  [9] "Component 4: 'current' is not a factor"                                                                
## [10] "Component 6: Mean relative difference: 4101.546"